# import dependencies for authentication
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
# import dependencies for eda
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings("ignore")
# read in liked songs
liked_songs = pd.read_csv('liked_songs.csv', index_col = [0])
liked_songs.shape
(4812, 23)
There are 4812 rows and 23 columns in this dataset. That's a lot of songs!
This already brings up some questions:
We've got some gears turning here. Let's start with the first two. We can use liked_songs.describe() and the Spotify API dashboard to create a table of our column names with a brief description of each.
# get column count, mean, std, etc.
liked_songs.describe()
| duration_s | popularity | acousticness | speechiness | key | liveness | instrumentalness | energy | tempo | time_signature | loudness | danceability | valence | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 | 4812.000000 |
| mean | 212.061016 | 44.401288 | 0.391558 | 0.075203 | 5.054239 | 0.170380 | 0.031551 | 0.528759 | 119.505147 | 3.892352 | -8.009924 | 0.576867 | 0.438185 | 2020.284289 | 7.111180 | 15.619077 |
| std | 53.039558 | 24.800363 | 0.337174 | 0.083933 | 3.517479 | 0.128426 | 0.146121 | 0.234326 | 30.490383 | 0.410889 | 4.124780 | 0.149988 | 0.226988 | 1.189322 | 3.717848 | 9.311429 |
| min | 34.050000 | 0.000000 | 0.000006 | 0.022500 | 0.000000 | 0.019300 | 0.000000 | 0.000280 | 39.120000 | 1.000000 | -39.995000 | 0.067600 | 0.029100 | 2017.000000 | 1.000000 | 1.000000 |
| 25% | 182.250500 | 29.000000 | 0.060300 | 0.033800 | 2.000000 | 0.097400 | 0.000000 | 0.348000 | 95.009750 | 4.000000 | -9.730000 | 0.479000 | 0.254000 | 2020.000000 | 4.000000 | 7.000000 |
| 50% | 207.762000 | 48.000000 | 0.309000 | 0.044400 | 5.000000 | 0.118000 | 0.000001 | 0.530000 | 118.065000 | 4.000000 | -7.156500 | 0.583000 | 0.412000 | 2020.000000 | 7.000000 | 15.000000 |
| 75% | 236.945250 | 64.000000 | 0.723000 | 0.074000 | 8.000000 | 0.193000 | 0.000151 | 0.715250 | 140.038250 | 4.000000 | -5.289000 | 0.684000 | 0.607000 | 2021.000000 | 11.000000 | 24.000000 |
| max | 780.439000 | 100.000000 | 0.996000 | 0.929000 | 11.000000 | 0.976000 | 0.973000 | 0.986000 | 218.365000 | 5.000000 | -1.148000 | 0.981000 | 0.974000 | 2022.000000 | 12.000000 | 31.000000 |
We can make a few observations here:
The average popularity of my songs is 44.40. With a popularity score range of 0 to 100, that's not very high at all! We should compare this with the distribution of popularity and the median.
My average song tempo is 119.51, which is classified as a moderate tempo.
# print the data type of each column
liked_songs.dtypes
id object name object artists object duration_s float64 popularity int64 added_at object acousticness float64 speechiness float64 key int64 liveness float64 instrumentalness float64 energy float64 tempo float64 time_signature int64 loudness float64 danceability float64 valence float64 genre_list object genre object pitch_class object year int64 month int64 day int64 dtype: object
Let's find the range of my dataset:
# report the earliest and latest `added_at` values found in the dataset
print(f"The first date I added a song to my `liked` list was {liked_songs.added_at.min()}")
print(f"The last date I added a song to my `liked` list was {liked_songs.added_at.max()}")
The first date I added a song to my `liked` list was 2017-03-03 19:50:46+00:00 The last date I added a song to my `liked` list was 2022-12-31 18:17:00+00:00
Now, let's find how many songs I've added to my Liked Songs per year.
# how many songs per year?
liked_songs.year.value_counts()
2020 2111 2022 937 2021 840 2019 710 2017 208 2018 6 Name: year, dtype: int64
Now we'll address which artists I listen to the most
# my 25 most-added artists overall (the slice below keeps 25 rows)
top_songs = liked_songs.artists.value_counts()[:25]
top_songs = top_songs.reset_index()
top_songs.columns = ['artist', 'number_of_songs']
# Bind `ax` to the Axes returned by barplot. Chaining `.set(...)` onto the
# assignment would store the return value of `.set()` instead, so later
# calls such as `ax.tick_params(...)` would fail.
ax = sns.barplot(x = 'number_of_songs', y = 'artist', data = top_songs)
ax.set(title='Top Songs');
# ax.tick_params(axis='x', rotation=90)
Taylor Swift and Ed Sheeran have the greatest number of songs in my Liked Songs dataset. It would be interesting to see if this is true for this year and to compare the proportion of songs I have in my dataset/the number of total songs by that artist
# my 25 most-added artists for 2022 (the slice below keeps 25 rows)
top_songs_2022 = liked_songs[liked_songs.year == 2022]['artists'].value_counts()[:25]
top_songs_2022 = top_songs_2022.reset_index()
top_songs_2022.columns = ['artist', 'number_of_songs']
# Bind `ax` to the Axes returned by barplot. Chaining `.set(...)` onto the
# assignment would store the return value of `.set()` instead, so later
# calls such as `ax.tick_params(...)` would fail.
ax = sns.barplot(x = 'number_of_songs', y = 'artist', data = top_songs_2022)
ax.set(title='Top Songs 2022');
# ax.tick_params(axis='x', rotation=90)
Overall, I've added the most Taylor Swift, but in 2022, I've added the most 5 Seconds of Summer. It's also interesting to note that Ed Sheeran isn't in my top 10 for 2022, but he is in my top 10 overall.
# count of genres / what genres I listen to most often
liked_songs.genre.value_counts()[:20]
dance pop 777 pop 692 alt z 343 other 329 boy band 224 acoustic pop 144 canadian pop 125 canadian contemporary r&b 119 folk-pop 94 bedroom pop 92 modern rock 74 indie folk 69 neo mellow 69 nyc pop 56 art pop 54 indie pop 53 alternative r&b 53 adult standards 49 british soul 48 post-teen pop 44 Name: genre, dtype: int64
There's a lot of pop in there - completely expected. I can piece some of the other genres together, but alt z? So let's take a look at what kinds of songs are deemed alt z
liked_songs[liked_songs.genre == 'alt z'].head(5)
| id | name | artists | duration_s | popularity | added_at | acousticness | speechiness | key | liveness | ... | time_signature | loudness | danceability | valence | genre_list | genre | pitch_class | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2VFetGqLYq0Pc8ZtRYCaeL | In The Kitchen | Reneé Rapp | 227.749 | 73 | 2022-12-31 08:05:02+00:00 | 0.833 | 0.0504 | 0 | 0.2640 | ... | 3 | -6.578 | 0.364 | 0.125 | ['alt z', 'indie pop', 'pop'] | alt z | C/Do | 2022 | 12 | 31 |
| 2 | 2nZq5WQOW4FEPxCVTdNGfB | Kissin' In The Cold | JP Saxe | 224.083 | 73 | 2022-12-26 20:38:34+00:00 | 0.777 | 0.0360 | 0 | 0.1620 | ... | 4 | -7.620 | 0.528 | 0.342 | ['alt z', 'canadian contemporary r&b', 'neo me... | alt z | C/Do | 2022 | 12 | 26 |
| 55 | 0eQJy4VAW7AkhKIHzXx3jG | The First One | Astrid S | 188.649 | 44 | 2022-10-20 04:40:59+00:00 | 0.390 | 0.1600 | 7 | 0.1030 | ... | 4 | -8.304 | 0.538 | 0.183 | ['alt z', 'dance pop', 'norwegian pop', 'pop',... | alt z | G/Sol | 2022 | 10 | 20 |
| 56 | 46ydq5g3k17iLJs3qMDvO6 | Hurts So Good | Astrid S | 208.728 | 73 | 2022-10-20 04:40:55+00:00 | 0.084 | 0.0586 | 7 | 0.0957 | ... | 4 | -5.027 | 0.675 | 0.378 | ['alt z', 'dance pop', 'norwegian pop', 'pop',... | alt z | G/Sol | 2022 | 10 | 20 |
| 109 | 6BgOYYhN3yzY3GzaUv3b7T | Fun While It Lasted | Ashe | 146.436 | 46 | 2022-10-19 06:27:33+00:00 | 0.834 | 0.0316 | 9 | 0.1460 | ... | 4 | -7.590 | 0.494 | 0.219 | ['alt z', 'pop'] | alt z | A/La | 2022 | 10 | 19 |
5 rows × 23 columns
Let's check the popularity of the music I listen to.
liked_songs[['added_at','name', 'artists', 'popularity', 'genre']].sort_values('popularity', ascending=False)[:20]
| added_at | name | artists | popularity | genre | |
|---|---|---|---|---|---|
| 217 | 2022-09-23 05:42:14+00:00 | Unholy (feat. Kim Petras) | Sam Smith | 100 | pop |
| 4288 | 2019-12-25 08:39:13+00:00 | All I Want for Christmas Is You | Mariah Carey | 99 | dance pop |
| 46 | 2022-10-21 16:15:23+00:00 | Anti-Hero | Taylor Swift | 97 | pop |
| 162 | 2022-10-08 03:32:21+00:00 | I Ain't Worried | OneRepublic | 95 | piano rock |
| 0 | 2022-12-31 18:17:00+00:00 | Kill Bill | SZA | 95 | pop |
| 825 | 2022-04-01 00:18:10+00:00 | As It Was | Harry Styles | 94 | pop |
| 2324 | 2020-10-01 00:25:25+00:00 | Another Love | Tom Odell | 94 | chill pop |
| 643 | 2022-05-20 04:14:23+00:00 | As It Was | Harry Styles | 93 | pop |
| 4283 | 2019-12-25 08:39:25+00:00 | Snowman | Sia | 93 | australian dance |
| 4280 | 2019-12-25 08:39:32+00:00 | It's Beginning to Look a Lot like Christmas | Michael Bublé | 92 | adult standards |
| 4287 | 2019-12-25 08:39:16+00:00 | Mistletoe | Justin Bieber | 92 | canadian pop |
| 2425 | 2020-09-10 00:43:39+00:00 | Blinding Lights | The Weeknd | 92 | canadian contemporary r&b |
| 565 | 2022-05-24 05:26:10+00:00 | Late Night Talking | Harry Styles | 91 | pop |
| 388 | 2022-07-17 01:45:35+00:00 | Bad Habit | Steve Lacy | 90 | afrofuturism |
| 4720 | 2017-03-19 21:59:23+00:00 | Yellow | Coldplay | 90 | permanent wave |
| 42 | 2022-10-22 04:40:42+00:00 | Midnight Rain | Taylor Swift | 89 | pop |
| 2890 | 2020-05-24 16:43:30+00:00 | Heather | Conan Gray | 89 | bedroom pop |
| 45 | 2022-10-21 17:21:03+00:00 | Lavender Haze | Taylor Swift | 89 | pop |
| 4758 | 2017-03-11 23:48:38+00:00 | Perfect | Ed Sheeran | 89 | pop |
| 22 | 2022-11-24 02:43:16+00:00 | Something in the Orange | Zach Bryan | 89 | oklahoma country |
A couple of names stand out here: Taylor Swift, Harry Styles, and Doja Cat because as far as general popularity as well as TikTok Popularity, these are pretty high up on the list. Most of the top 20 songs in my dataset based on popularity are within the pop genre as well.
That goes for Sam Smith's song 'Unholy' as well which has a popularity score of 100.
len(liked_songs[liked_songs.popularity > 79])*100/len(liked_songs)
4.530340814630091
Only 4.53% of the songs I listen to have a popularity score above 79.
sns.histplot(data=liked_songs, x="popularity").set(title='Song Popularity');
I was also curious about the number of songs I saved that have the same name.
sns.histplot(data=liked_songs[liked_songs.year == 2022], x="popularity", color = 'pink');
The distribution of my songs is very roughly normal, barring the songs with a 0 popularity score. Let's see how many songs have 0 popularity.
# zero popularity songs
zero_pop = liked_songs[liked_songs.popularity == 0]
# number of songs with popularity of 0
len(zero_pop)
550
# number of artists that have songs with 0 popularity
zero_pop.artists.nunique()
252
liked_songs.popularity.mean()
44.40128844555279
liked_songs.popularity.median()
48.0
These seem to be somewhat older songs in the dataset or what would be considered 'party' songs.
# looking at the name frequency of the songs I've added
liked_songs.name.value_counts()[:20]
She 6 Daylight 6 Memories 5 Trouble 5 Hurricane 5 Boyfriend 5 One 4 Somebody Else 4 Enchanted 4 Shapeshifter 4 Invisible 4 Wolves 4 Ghosts 4 Golden 4 Waiting 4 Runaway 4 I'm Yours 4 The City 4 Roses 4 Colors 4 Name: name, dtype: int64
Does this say anything about the types of songs I enjoy listening to?
liked_songs[liked_songs.name == "She"]
| id | name | artists | duration_s | popularity | added_at | acousticness | speechiness | key | liveness | ... | time_signature | loudness | danceability | valence | genre_list | genre | pitch_class | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1552 | 56XCgjSYaLg0TjxkVSpqNu | She | Ed Sheeran | 244.653 | 37 | 2021-03-31 03:24:22+00:00 | 0.811000 | 0.0361 | 9 | 0.1300 | ... | 3 | -10.910 | 0.573 | 0.3770 | ['pop', 'uk pop'] | pop | A/La | 2021 | 3 | 31 |
| 2000 | 63o0CBXZ6LAnJoqz26oRj9 | She | Meyru | 334.608 | 0 | 2020-11-04 04:17:04+00:00 | 0.533000 | 0.0348 | 11 | 0.3400 | ... | 4 | -9.908 | 0.569 | 0.0408 | ['other'] | other | B/Si | 2020 | 11 | 4 |
| 2549 | 1z1ztKUrDr09ZSMDnN3QIG | She | Selena Gomez | 172.999 | 56 | 2020-08-28 02:39:22+00:00 | 0.277000 | 0.0481 | 2 | 0.0672 | ... | 4 | -6.686 | 0.783 | 0.7230 | ['dance pop', 'pop', 'post-teen pop'] | dance pop | D/Re | 2020 | 8 | 28 |
| 2951 | 5hwzzutxeqeC5VMSpnfgul | She | Selena Gomez | 172.999 | 27 | 2020-05-06 00:27:17+00:00 | 0.286000 | 0.0485 | 2 | 0.0682 | ... | 4 | -6.687 | 0.784 | 0.7280 | ['dance pop', 'pop', 'post-teen pop'] | dance pop | D/Re | 2020 | 5 | 6 |
| 3375 | 6SQLk9HSNketfgs2AyIiMs | She | Harry Styles | 362.653 | 75 | 2020-03-23 01:37:17+00:00 | 0.000532 | 0.0272 | 0 | 0.1900 | ... | 3 | -5.942 | 0.535 | 0.4570 | ['pop'] | pop | C/Do | 2020 | 3 | 23 |
| 4770 | 3ICdPHubhqTJ4Lm9NEb2W3 | She | Ed Sheeran | 244.653 | 47 | 2017-03-11 21:45:28+00:00 | 0.811000 | 0.0361 | 9 | 0.1300 | ... | 3 | -10.910 | 0.573 | 0.3770 | ['pop', 'uk pop'] | pop | A/La | 2017 | 3 | 11 |
6 rows × 23 columns
# Collapse repeated saves of the same track (same name and artist), keeping
# the first row encountered, then renumber the index from 0.
# FIXME: this should actually go in preprocessing.
liked_songs_nodup = liked_songs.drop_duplicates(subset=['name', 'artists'], keep='first')
liked_songs_nodup = liked_songs_nodup.reset_index(drop=True)
liked_songs_nodup.shape
(4402, 23)
# I'm going to change it to liked_songs because it's long
liked_songs = liked_songs_nodup
Now that we've done some initial exploring, we'll create some visuals.
# Numeric audio descriptors to inspect, one plot per column.
audio_features = liked_songs.loc[:, [
    'acousticness', 'speechiness', 'key', 'liveness',
    'instrumentalness', 'energy', 'tempo', 'time_signature',
    'loudness', 'danceability', 'valence',
]]

# Distribution of every audio feature, each on its own wide, short figure.
for col in audio_features.columns:
    plt.figure(figsize=(20, 2))
    sns.histplot(x=col, data=audio_features)

# change the color for each histplot?
# Same features again as box plots to highlight spread and outliers.
for col in audio_features.columns:
    plt.figure(figsize=(17, 1))
    sns.boxplot(x=col, data=audio_features)
# find the top 6 genres in the dataset
liked_songs.genre.value_counts().head(6).axes
[Index(['dance pop', 'pop', 'other', 'alt z', 'boy band', 'acoustic pop'], dtype='object')]
# Keep only the tracks whose genre is one of the six most frequent genres.
# The genre names are derived from the data rather than typed in by hand,
# so the selection stays correct if the dataset changes.
top_six_genres = liked_songs[
    liked_songs.genre.isin(liked_songs.genre.value_counts().head(6).index)
]
sns.boxplot(data=top_six_genres, x="acousticness", y="genre");
Okay, we can loop through and do this for all audio features if we want
for col in audio_features:
plt.figure(figsize=(21,2))
sns.boxplot(data=top_six_genres, x=col, y="genre", palette="Spectral");
# running the violin plot on top 6 genres
sns.violinplot(x=top_six_genres["genre"], y=top_six_genres["popularity"]);
# create heatmap for correlation
plt.figure(figsize=(15, 10))
# Pearson correlation is only defined for numeric columns. Select them
# explicitly instead of relying on DataFrame.corr() to silently drop the
# text columns (id, name, artists, ...); pandas >= 2.0 raises on those.
corr = liked_songs.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True, cmap="coolwarm").set_title('Pearson correlation matrix')
plt.show()
# select acousticness and energy, ordered by acousticness (highest first)
liked_songs.sort_values(by=['acousticness'],ascending=False)[['name', 'artists', 'acousticness', 'energy']]
| name | artists | acousticness | energy | |
|---|---|---|---|---|
| 4145 | The Slow Calm | Jon E. Amber | 0.996000 | 0.00028 |
| 825 | Rêverie, L. 68 | Claude Debussy | 0.995000 | 0.00904 |
| 1105 | Invention No. 9 in F Minor, BWV 780 | Johann Sebastian Bach | 0.995000 | 0.03800 |
| 4117 | Shir Eres (Lullaby) | Avishai Cohen | 0.995000 | 0.01570 |
| 1341 | Songs Without Words, Op. 19b: No. 6, Andante s... | Felix Mendelssohn | 0.995000 | 0.00513 |
| ... | ... | ... | ... | ... |
| 4204 | Smells Like Teen Spirit | Nirvana | 0.000025 | 0.91200 |
| 3300 | Today | The Smashing Pumpkins | 0.000025 | 0.85500 |
| 4203 | Spoonman | Soundgarden | 0.000022 | 0.90200 |
| 571 | The Only Reason | 5 Seconds of Summer | 0.000008 | 0.80800 |
| 3042 | Talk Is Cheap | Miley Cyrus | 0.000006 | 0.88400 |
4402 rows × 4 columns
# select energy and loudness, ordered by loudness (loudest first)
liked_songs.sort_values(by=['loudness'],ascending=False)[['name', 'artists', 'energy', 'loudness']]
| name | artists | energy | loudness | |
|---|---|---|---|---|
| 255 | The Difference | Flume | 0.86500 | -1.148 |
| 3409 | Show Me What I'm Looking For | Carolina Liar | 0.86400 | -1.159 |
| 3467 | We Made You | Eminem | 0.85300 | -1.203 |
| 1893 | Fighter | Christina Aguilera | 0.92000 | -1.357 |
| 3898 | Give It Up (feat. Elizabeth Gillies & Ariana G... | Victorious Cast | 0.94400 | -1.541 |
| ... | ... | ... | ... | ... |
| 1585 | Gnossiennes: No. 3, Lent | Erik Satie | 0.01400 | -35.174 |
| 4150 | 6 Consolations, S. 172: No. 3 in D-Flat Major ... | Franz Liszt | 0.00363 | -36.759 |
| 4152 | Gymnopédie No. 1 | Erik Satie | 0.01270 | -36.856 |
| 1341 | Songs Without Words, Op. 19b: No. 6, Andante s... | Felix Mendelssohn | 0.00513 | -37.371 |
| 4288 | 3 Gymnopedies (arr. A. Miolin): Gymnopedie No. 1 | Anders Miolin | 0.00356 | -39.995 |
4402 rows × 4 columns
# get max loudness
liked_songs.loudness.max()
-1.148
# printing out correlation table
corr
| duration_s | popularity | acousticness | speechiness | key | liveness | instrumentalness | energy | tempo | time_signature | loudness | danceability | valence | year | month | day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration_s | 1.000000 | 0.031815 | 0.034446 | -0.132096 | -0.003346 | 0.005957 | 0.061642 | -0.080983 | -0.011011 | -0.056028 | -0.081418 | -0.240539 | -0.274287 | -0.076360 | 0.030608 | -0.015812 |
| popularity | 0.031815 | 1.000000 | -0.171208 | 0.032150 | -0.013396 | -0.043031 | -0.098013 | 0.147871 | 0.031145 | 0.019090 | 0.155916 | 0.107498 | 0.081278 | 0.098730 | 0.040239 | 0.057228 |
| acousticness | 0.034446 | -0.171208 | 1.000000 | -0.118135 | -0.013288 | -0.102350 | 0.241087 | -0.802025 | -0.189376 | -0.191801 | -0.648510 | -0.311816 | -0.428647 | -0.075358 | -0.038523 | -0.096997 |
| speechiness | -0.132096 | 0.032150 | -0.118135 | 1.000000 | 0.043645 | 0.110887 | -0.054260 | 0.140477 | 0.072261 | 0.045833 | 0.066252 | 0.194771 | 0.189573 | -0.065093 | 0.100805 | -0.000361 |
| key | -0.003346 | -0.013396 | -0.013288 | 0.043645 | 1.000000 | 0.022326 | -0.030697 | 0.014709 | 0.019293 | -0.002465 | 0.011437 | 0.031150 | 0.033060 | 0.003758 | -0.011192 | -0.024308 |
| liveness | 0.005957 | -0.043031 | -0.102350 | 0.110887 | 0.022326 | 1.000000 | -0.053982 | 0.145014 | 0.037430 | 0.017698 | 0.092098 | -0.013642 | 0.104309 | -0.010005 | -0.000381 | 0.012292 |
| instrumentalness | 0.061642 | -0.098013 | 0.241087 | -0.054260 | -0.030697 | -0.053982 | 1.000000 | -0.263983 | -0.073373 | -0.088275 | -0.549775 | -0.272730 | -0.169129 | -0.055967 | 0.043692 | -0.094861 |
| energy | -0.080983 | 0.147871 | -0.802025 | 0.140477 | 0.014709 | 0.145014 | -0.263983 | 1.000000 | 0.224378 | 0.213425 | 0.777324 | 0.269271 | 0.540316 | 0.041201 | 0.020076 | 0.117822 |
| tempo | -0.011011 | 0.031145 | -0.189376 | 0.072261 | 0.019293 | 0.037430 | -0.073373 | 0.224378 | 1.000000 | -0.016496 | 0.177407 | -0.092420 | 0.102179 | 0.034660 | 0.005372 | 0.029178 |
| time_signature | -0.056028 | 0.019090 | -0.191801 | 0.045833 | -0.002465 | 0.017698 | -0.088275 | 0.213425 | -0.016496 | 1.000000 | 0.185973 | 0.197015 | 0.161807 | -0.000262 | 0.018368 | 0.037414 |
| loudness | -0.081418 | 0.155916 | -0.648510 | 0.066252 | 0.011437 | 0.092098 | -0.549775 | 0.777324 | 0.177407 | 0.185973 | 1.000000 | 0.325456 | 0.411620 | 0.046350 | -0.001727 | 0.124057 |
| danceability | -0.240539 | 0.107498 | -0.311816 | 0.194771 | 0.031150 | -0.013642 | -0.272730 | 0.269271 | -0.092420 | 0.197015 | 0.325456 | 1.000000 | 0.503276 | 0.036595 | 0.034106 | 0.017247 |
| valence | -0.274287 | 0.081278 | -0.428647 | 0.189573 | 0.033060 | 0.104309 | -0.169129 | 0.540316 | 0.102179 | 0.161807 | 0.411620 | 0.503276 | 1.000000 | 0.051700 | 0.000238 | 0.043327 |
| year | -0.076360 | 0.098730 | -0.075358 | -0.065093 | 0.003758 | -0.010005 | -0.055967 | 0.041201 | 0.034660 | -0.000262 | 0.046350 | 0.036595 | 0.051700 | 1.000000 | -0.167197 | -0.074223 |
| month | 0.030608 | 0.040239 | -0.038523 | 0.100805 | -0.011192 | -0.000381 | 0.043692 | 0.020076 | 0.005372 | 0.018368 | -0.001727 | 0.034106 | 0.000238 | -0.167197 | 1.000000 | 0.102382 |
| day | -0.015812 | 0.057228 | -0.096997 | -0.000361 | -0.024308 | 0.012292 | -0.094861 | 0.117822 | 0.029178 | 0.037414 | 0.124057 | 0.017247 | 0.043327 | -0.074223 | 0.102382 | 1.000000 |
corr.min()
duration_s -0.274287 popularity -0.171208 acousticness -0.802025 speechiness -0.132096 key -0.030697 liveness -0.102350 instrumentalness -0.549775 energy -0.802025 tempo -0.189376 time_signature -0.191801 loudness -0.648510 danceability -0.311816 valence -0.428647 year -0.167197 month -0.167197 day -0.096997 dtype: float64
corr.max()
duration_s 1.0 popularity 1.0 acousticness 1.0 speechiness 1.0 key 1.0 liveness 1.0 instrumentalness 1.0 energy 1.0 tempo 1.0 time_signature 1.0 loudness 1.0 danceability 1.0 valence 1.0 year 1.0 month 1.0 day 1.0 dtype: float64
Correlation values range from -1 to 1. The closer the absolute value of a correlation is to 1, the stronger the relationship between the two variables. Variables with a strong positive correlation increase together, whereas variables with a strong negative correlation move in opposite directions (as one goes up, the other goes down, and vice versa). The closer the correlation is to 0, the weaker the relationship.
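The strongest correlation is between energy and acousticness at -0.80, followed by energy and loudness at 0.78. The weakest is between speechiness and day at -0.00036. Overall, apart from these few pairs, none of our variables are very strongly correlated.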
# we can also create a pairplot of the correlations
sns.pairplot(liked_songs);
Audio Features
audio_descp = liked_songs.copy()
audio_descp = audio_descp.drop(columns=['year', 'month', 'day'])
The pairplot above is really crowded because it was run on all of the columns in our dataset. It shows the pairwise relationship between each column - so it's similar to the heat map but instead of correlation, it's the data plots of x against y where x is the first column of interest and y is the second column of interest.
# we can also create a pairplot of the correlations
sns.pairplot(audio_descp)
<seaborn.axisgrid.PairGrid at 0x18bf99a3100>
# find the closest corr for each col
corr.columns
corr.iloc[:,0] # everything for first col
This is just a visual confirmation of how popular my music tastes are. The majority of my music tastes fall in the mid-range in terms of popularity
# mean popularity score value
liked_songs.popularity.mean()
What about the popularity in 2022?
liked_songs[liked_songs.year == 2022].popularity.mean()
It'll be interesting to see how the popularity of my songs has changed over the years.
# histogram of duration in seconds
sns.histplot(liked_songs.duration_s, bins=25);
(liked_songs.duration_s.mean())/60
Most of my saved songs are around 3 minutes long.
# longest song in my saved
# idxmax()/idxmin() return index *labels*, so the rows are looked up with
# .loc. Positional .iloc only gives the same row while the index happens to
# be exactly 0..n-1.
liked_songs.loc[liked_songs['duration_s'].idxmax(), ['artists', 'name']]
# shortest song in my saved
liked_songs.loc[liked_songs['duration_s'].idxmin(), ['artists', 'name']]
# Subset of tracks whose `year` column is later than 2021.
liked_songs_2022 = liked_songs.loc[liked_songs['year'] > 2021]
print(f"{len(liked_songs_2022)} songs have been added in 2022")
liked_songs_2022.head(5)
sns.scatterplot(data=liked_songs_2022, x='energy', y='acousticness');
Above is the visualization of what we saw earlier in the heatmap. As energy increases, the amount of acousticness the song has tends to decrease. We can fit this with a linear regression to bring out an even cleaner pattern.
sns.regplot(data=liked_songs_2022, x='energy', y='acousticness');
How have the audio features of my song choices changed over time? We can write a time series function to plot the change of our data over time.
def plot_time_series(col_name, title, rolling_window_days):
    """Plot the rolling mean of one `liked_songs` column over `added_at`.

    Reads the notebook-level `liked_songs` frame, orders the values of
    `col_name` by the time each track was added, smooths them with a rolling
    mean and shows the resulting line plot.

    Parameters
    ----------
    col_name : str
        Name of the `liked_songs` column to average; also used as the y label.
    title : str
        Title placed on the plot.
    rolling_window_days : int
        Width of the rolling window, in calendar days.
    """
    # Parse the timestamps so the window can be expressed as a time span.
    # With an integer window, rolling() counts rows rather than days.
    added_at = pd.DatetimeIndex(pd.to_datetime(liked_songs['added_at'], utc=True))
    daily_series = pd.Series(data=np.array(liked_songs[col_name]),
                             name=col_name,
                             index=added_at).sort_index()
    # A time-based window needs a sorted datetime index (done above).
    # Note: with such a window pandas averages over however many rows fall
    # inside it, so the earliest points are plotted instead of being NaN.
    window = pd.Timedelta(days=rolling_window_days)
    (daily_series.rolling(window=window)
        .mean()
        .plot(figsize=(30, 10))
        .set(xlabel='date (by day)', ylabel=col_name, title=title))
    plt.show()
plot_time_series('popularity', 'Popularity over time (window = 30 days)', 30)
plot_time_series('duration_s', 'Duration (s) over time (window = 30 days)', 30)
plot_time_series('danceability', 'Danceability over time (window = 30 days)', 30)
plot_time_series('valence', 'Valence over time (window = 30 days)', 30)
plot_time_series('energy', 'Energy over time (window = 30 days)', 30)
plot_time_series('tempo', 'Tempo over time (window = 30 days)', 30)
In general, our plots above are fairly stationary from 2020 on.
# One box plot per feature, each rendered as its own figure with its title.
for feature, heading in [
    ('valence', 'Tracks valence (1 = happy, 0 = sad)'),
    ('loudness', 'Loudness'),
    ('tempo', 'Tempo'),
    ('energy', 'Energy'),
]:
    # plt.figure(figsize=(15, 10))
    sns.boxplot(x=liked_songs[feature]).set_title(heading)
    plt.show()
# Plot how each selected feature correlates with popularity, using
# yellowbrick's FeatureCorrelation visualizer on the 2022 subset.
# !pip install yellowbrick
from yellowbrick.target import FeatureCorrelation
# define columns to select
# NOTE(review): `liked_songs_2022` is filtered on `year`, so 'year' may be
# constant in this subset and its correlation undefined; confirm whether it
# belongs in this list.
feature_names = ['acousticness', 'danceability', 'energy', 'instrumentalness',
'liveness', 'loudness', 'speechiness', 'tempo', 'valence','duration_s','key','year']
# X holds the feature columns, y the popularity target
X, y = liked_songs_2022[feature_names], liked_songs_2022['popularity']
# Create a list of the feature names
features = np.array(feature_names)
# Instantiate the visualizer
visualizer = FeatureCorrelation(labels=features)
# applies to figures created from here on
plt.rcParams['figure.figsize']=(6,6)
visualizer.fit(X, y) # Fit the data to the visualizer
visualizer.show();
Make this a graph instead
| Year | Number of songs |
|---|---|
| 2020 | 2112 |
| 2022 | 907 |
| 2021 | 842 |
| 2019 | 711 |
| 2017 | 208 |
| 2018 | 6 |
Taylor Swift is the artist I have added the greatest number of songs for over the years. How did my addition of Taylor Swift songs change throughout the years, or did it? Note: We'll want to pay attention to album releases since she took a brief hiatus and I will often add her songs immediately when an album is released It might also be interesting to see if I add the songs the same year the song is released or later on.
liked_songs['year'] = pd.DatetimeIndex(liked_songs['added_at']).year
liked_songs['month'] = pd.DatetimeIndex(liked_songs['added_at']).month
liked_songs['day'] = pd.DatetimeIndex(liked_songs['added_at']).day
taylor_swift = liked_songs[liked_songs.artists == 'Taylor Swift']
taylor_swift
sns.countplot(data=taylor_swift, y="year",hue='pitch_class');
sns.countplot(data=taylor_swift, y="year",hue='month');
sns.countplot(data=liked_songs, x="month", hue='year');
Maybe use Altair instead to put these side by side per year
t_swizzle = liked_songs[liked_songs.artists == 'Taylor Swift']
liked_songs.columns
sns.scatterplot(data=t_swizzle, x='year', y='popularity');
plt.xticks(rotation=85);
Change the above to a floating boxplot
sns.boxplot(data=t_swizzle, y='year', x='popularity');
plt.xticks(rotation=90);
taylor_swift = liked_songs[liked_songs.artists =='Taylor Swift']
taylor_swift.columns
taylor_swift.key.unique()
sns.histplot(data=taylor_swift, x="popularity", hue='key');
Should I compare the data for my top artist this year vs last year?
Most of TSwizzle's songs fall in the popularity of around 65-70
I want to see the most and least popular songs (T-Swizzle and otherwise) that I added each year.
Further visualization:
# getting the most popular tracks in the dataset
liked_songs.sort_values(by=['popularity'], ascending=False)[['name', 'artists']].head(20)
# top 20 genres by count
top_20c = pd.DataFrame(liked_songs['genre'].value_counts().head(20)).reset_index()
# Assign the column labels directly. `set_axis(..., inplace=True)` was
# deprecated in pandas 1.5 and removed in 2.0; plain assignment works on
# every version and does not depend on how reset_index() names the columns.
top_20c.columns = ['genre', 'count']
sns.barplot(data=top_20c, y='genre', x='count').set(title='Number of Tracks By Genre (Top 20)');
# popularity of the songs
# get the average popularity of the genres
# numeric_only=True restricts the per-genre mean to numeric columns; without
# it pandas >= 2.0 raises on the text columns (id, name, artists, ...).
top_20 = (liked_songs.groupby('genre')
          .mean(numeric_only=True)
          .sort_values(by='popularity', ascending=False)
          .head(20)
          .reset_index())
top_20
sns.barplot(data=top_20, y='genre', x='popularity').set(title='Popularity of Tracks By Genre (Top 20)');
sns.set(rc = {'figure.figsize':(20,20)})
sns.jointplot(data=liked_songs, x="loudness", y="energy", kind="kde");
!jupyter nbconvert eda.ipynb --to html --output eda_code.html